# load the census data
table.builder <- readRDS(file = './EDA/output/sa2_table_builder.rds')
#Missingness in the dataset
# check missingness in the original dataset
sapply(table.builder, function(x) sum(is.na(x)))
## sa2_area total_pop male
## 0 0 3
## female overseas childcare
## 4 4 3
## employ oc_positive oc_negative
## 4 7 7
## sp_male sp_female sp_total
## 4 4 4
## student age0 age10
## 4 0 0
## age20 age30 age40
## 0 0 0
## age50 age60 age70
## 0 0 0
## age80 age90 age100
## 0 0 0
## SA2_MAINCODE_2016 SA2_5DIGITCODE_2016 SA3_CODE_2016
## 2 2 2
## SA3_NAME_2016 SA4_CODE_2016 SA4_NAME_2016
## 2 2 2
## GCC_NAME16 time_to_orange time_to_nowra
## 3 7 7
## time_to_canberra time_to_port time_to_rpa
## 7 7 7
## time_to_westmead shortest_time
## 7 7
#Prepare the Dataset
# Prepare the dataset for modelling
table.builder2 <- table.builder %>%
# create variable for the percentage of the population aged 60 and above
mutate(over60 = (rowSums(select(.,age60:age100)) / total_pop ) * 100 ) %>%
# drop the variables we wont need for the model
select(-c(age0,age10,age20,age30,age40,age50,age60,age70,age80,age90,age100,
SA2_MAINCODE_2016,SA2_5DIGITCODE_2016,SA3_CODE_2016,SA3_NAME_2016,
SA4_CODE_2016,SA4_NAME_2016, time_to_canberra, time_to_orange, time_to_nowra,
time_to_port, time_to_rpa, time_to_westmead)) %>%
# filter rows with no population
filter(total_pop > 0) %>%
# filter rows with missing outcome
filter(!is.na(shortest_time)) %>%
# Filter missing values from overcrowding stat
filter(!is.na(oc_positive)) %>%
# recreate outcome variable as numeric in minutes
mutate(st_min = as.numeric(shortest_time, 'minutes'))
#Distribution of the Outcome
ggplot(data = table.builder2, aes(x = st_min)) +
geom_histogram() +
scale_x_continuous(breaks = seq(0,660,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Function for univariate Exploratory Data Analysis:
# scatter plots for each variable by the shortest time to an achd clinic
scatter.vars <- function(var) {
#Make the plot
ggplot(data = table.builder2, aes_string(x = 'st_min', y = var)) +
geom_point() +
geom_smooth(color = "Blue", se=FALSE, size = 0.5) +
geom_smooth(method = "lm", color = "red", se=FALSE, size = 0.5) +
scale_x_continuous(breaks = seq(0,660,60)) +
ggtitle(var)
}
vars <- c("total_pop","male", "female","overseas","childcare","employ","oc_positive","oc_negative","sp_male", "sp_female","sp_total","student","over60")
#Modelling the entire dataset
##How is each variable related to the outcome?
There seems to be two different relationships occuring between the variables and the outcome. For travel times less than 120 mins, there is a stark change, either positive or negative, in some variables. After this time, however the relationship i smore contast.
- Positive change over the first 120 mins: houses with spare rooms, single parents households, over 60 popultion
- Negative change over the first 120 mins: Total population, percentage born overseas, houses with overcrowding, percentage of students
- No change: percentage of males or females
Unemployement doesnt show much change but the large outliers may be hiding this
lapply(vars, function(x) scatter.vars(x))
## [[1]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[2]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[3]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[4]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[5]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[6]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[7]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[8]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[9]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[10]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[11]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[12]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##
## [[13]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
check unemployment without outliers
ggplot(data = table.builder2, aes(x = st_min, y = employ)) +
geom_point() +
geom_smooth(color = "Blue", se=FALSE, size = 0.5) +
geom_smooth(method = "lm", color = "red", se=FALSE, size = 0.5) +
scale_x_continuous(breaks = seq(0,660,60)) +
scale_y_continuous(limits = c(0,7.5)) +
ggtitle("unemployement")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
##Fit Univariate Linear Models
#functions to fit univariate glms
univ.lm <- function(outcome, variables) {
# list to store output tables
out <- list()
# count to add to list
count <- 1
# loop through the variables
for (i in variables) {
f <- as.formula(paste(outcome, i, sep = '~'))
# fit the glm model and add to list
out[[count]] <- lm(f, data=table.builder2)
#add to the count
count <- count + 1
}
return(out)
}
#fit an univarite model for each variable
uni.mods <- univ.lm("st_min", vars)
# display the coefficients and p values for each univariate model
for (i in 1:length(vars)) {
print(summary(uni.mods[[i]])$coefficients)
}
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 158.078500538 8.6917048406 18.187284 1.438701e-58
## total_pop -0.005240542 0.0005839859 -8.973747 4.181821e-18
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.055076 65.938087 1.5932382 0.1116641
## male -0.316832 1.323258 -0.2394334 0.8108561
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.5954260 58.357306 1.2097102 0.2268942
## female 0.3718028 1.156598 0.3214624 0.7479784
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 189.730903 8.0771513 23.48983 1.000497e-85
## overseas -3.248089 0.2309918 -14.06149 9.626960e-39
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 128.642187 22.737256 5.657771 2.435490e-08
## childcare -1.799417 1.020624 -1.763055 7.842986e-02
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.149086 7.422959 12.952933 8.483542e-34
## employ -2.313486 2.025953 -1.141925 2.539672e-01
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.2387408 19.7988803 1.628311 0.10401437
## oc_positive 0.9801954 0.3317902 2.954263 0.00326424
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 124.289986 5.839790 21.283298 2.543746e-74
## oc_negative -9.352942 1.104309 -8.469496 2.122294e-16
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.58517 11.57320 1.692286 9.114077e-02
## sp_male 91.27459 14.10469 6.471223 2.104604e-10
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.37748 13.453574 3.893202 0.0001107016
## sp_female 11.13407 3.838928 2.900308 0.0038725146
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.31050 13.844817 2.767136 0.0058397265
## sp_total 12.49739 3.223442 3.877033 0.0001181149
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 242.549810 19.5462432 12.409024 1.889368e-31
## student -6.593067 0.8217367 -8.023333 5.940304e-15
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.069214 12.4669534 -1.609793 1.079996e-01
## over60 4.650473 0.5008792 9.284621 3.421759e-19
##Build a model with the variables that are significant in the univariate models
sa2.lm <- lm(st_min ~ total_pop + overseas + childcare + employ + oc_positive +
oc_negative + sp_male + sp_female + student + over60,
data = table.builder2)
summary(sa2.lm)
##
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + employ +
## oc_positive + oc_negative + sp_male + sp_female + student +
## over60, data = table.builder2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -297.05 -46.14 -16.01 21.18 488.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.156e+02 4.208e+01 7.500 2.53e-13 ***
## total_pop -1.667e-03 6.272e-04 -2.657 0.008108 **
## overseas -3.931e+00 4.374e-01 -8.987 < 2e-16 ***
## childcare -4.620e+00 1.299e+00 -3.556 0.000409 ***
## employ -1.972e+00 1.939e+00 -1.017 0.309696
## oc_positive -3.093e-01 5.380e-01 -0.575 0.565563
## oc_negative 4.511e+00 1.862e+00 2.423 0.015726 *
## sp_male 1.951e+01 1.670e+01 1.169 0.243028
## sp_female 3.671e+00 4.735e+00 0.775 0.438426
## student -4.700e-01 1.187e+00 -0.396 0.692415
## over60 3.804e-01 6.606e-01 0.576 0.565015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 85.12 on 558 degrees of freedom
## Multiple R-squared: 0.347, Adjusted R-squared: 0.3353
## F-statistic: 29.66 on 10 and 558 DF, p-value: < 2.2e-16
##Build another model with the sigificant variables from sa2.lm
reduced.sa2.lm <- lm(st_min ~ total_pop + overseas + childcare + oc_negative, data = table.builder2)
summary(reduced.sa2.lm)
##
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + oc_negative,
## data = table.builder2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -322.71 -46.60 -15.32 17.66 497.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.413e+02 2.294e+01 14.879 < 2e-16 ***
## total_pop -1.772e-03 5.909e-04 -2.999 0.002827 **
## overseas -4.341e+00 3.743e-01 -11.600 < 2e-16 ***
## childcare -5.263e+00 9.211e-01 -5.714 1.79e-08 ***
## oc_negative 5.506e+00 1.559e+00 3.532 0.000446 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 85.29 on 564 degrees of freedom
## Multiple R-squared: 0.3375, Adjusted R-squared: 0.3328
## F-statistic: 71.82 on 4 and 564 DF, p-value: < 2.2e-16
anova(reduced.sa2.lm, sa2.lm)
## Analysis of Variance Table
##
## Model 1: st_min ~ total_pop + overseas + childcare + oc_negative
## Model 2: st_min ~ total_pop + overseas + childcare + employ + oc_positive +
## oc_negative + sp_male + sp_female + student + over60
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 564 4102560
## 2 558 4043203 6 59358 1.3653 0.2265
##Model Diagnositcs Checking the residuals - homoskedacisty
# load the olsrr library
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
# residuals versus fitted values plot
ols_plot_resid_fit(reduced.sa2.lm)
# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(reduced.sa2.lm, rhs=TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------------------------
## Response : st_min
## Variables: total_pop overseas childcare oc_negative
##
## Test Summary
## -------------------------------
## DF = 4
## Chi2 = 251.3026
## Prob > Chi2 = 3.411563e-53
table.builder2$residuals <- residuals(reduced.sa2.lm)
ggplot(data = table.builder2, aes(x = st_min, y = residuals)) +
geom_point() +
scale_x_continuous(breaks = seq(0,660,60)) +
ggtitle("Residuals vs Outcome")
ggplot(data = table.builder2, aes(x = total_pop, y = residuals)) +
geom_point() +
ggtitle("Residuals vs Total Population")
ggplot(data = table.builder2, aes(x = overseas, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of population born overseas")
ggplot(data = table.builder2, aes(x = childcare, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of population in chlidcare")
ggplot(data = table.builder2, aes(x = oc_negative, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of overcrowded households")
Checking the residuals - normality
qqnorm(residuals(reduced.sa2.lm), ylab="Residuals")
qqline(residuals(reduced.sa2.lm))
hist(residuals(reduced.sa2.lm), breaks = 50)
shapiro.test(residuals(reduced.sa2.lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(reduced.sa2.lm)
## W = 0.84545, p-value < 2.2e-16
reject the null hypothesis that the residuals are normal
Checking for influentiak Observations
library(faraway)
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
##
## Attaching package: 'faraway'
## The following object is masked from 'package:olsrr':
##
## hsb
faraway::halfnorm(cooks.distance(reduced.sa2.lm), nlab = 1, ylab="cooks distance", lab = table.builder2$sa2_area)
# refit the model without the port botany industrial area
reduced.sa2.lm.2 <- lm(st_min ~ total_pop + overseas + childcare + oc_negative, data = table.builder2, subset = (cooks.distance(reduced.sa2.lm) < 0.02))
summary(reduced.sa2.lm.2)
##
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + oc_negative,
## data = table.builder2, subset = (cooks.distance(reduced.sa2.lm) <
## 0.02))
##
## Residuals:
## Min 1Q Median 3Q Max
## -175.67 -44.82 -13.55 20.22 359.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.771e+02 2.336e+01 16.145 < 2e-16 ***
## total_pop -1.781e-03 5.359e-04 -3.323 0.000949 ***
## overseas -4.262e+00 3.438e-01 -12.395 < 2e-16 ***
## childcare -6.899e+00 9.326e-01 -7.398 5.12e-13 ***
## oc_negative 4.665e+00 1.424e+00 3.276 0.001118 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77.06 on 557 degrees of freedom
## Multiple R-squared: 0.3823, Adjusted R-squared: 0.3778
## F-statistic: 86.18 on 4 and 557 DF, p-value: < 2.2e-16
# residuals versus fitted values plot
ols_plot_resid_fit(reduced.sa2.lm.2)
# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(reduced.sa2.lm.2, rhs=TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------------------------
## Response : st_min
## Variables: total_pop overseas childcare oc_negative
##
## Test Summary
## -------------------------------
## DF = 4
## Chi2 = 156.8503
## Prob > Chi2 = 6.923801e-33
This model breaks a fair few assumptions, constant variance and normality of the residuals. Also the Residual standard error is quite high and the R2 quite low. suggesting a poor fit. The initial EDA also suggested the data was not linear and the spread of the outcome is not normal. It might be that the sydney population and rural population have very different circumstance. Perhaps sepaerating them would help.
Also, judging by the relationship between the residuals and the predictors, some of the very low population areas might be causing an issue for the model fit, so we can try remove areas with population less than 100
#Distribution of the Outcome separated by GCC Name
table.builder2$GCC_NAME16 <- as.factor(table.builder2$GCC_NAME16)
table.builder.syd <- table.builder2 %>% filter(GCC_NAME16 == "Greater Sydney") %>% filter(total_pop > 100)
table.builder.nsw <- table.builder2 %>% filter(GCC_NAME16 == "Rest of NSW") %>% filter(total_pop > 100)
ggplot(data = table.builder.syd, aes(x = st_min)) +
geom_histogram() +
scale_x_continuous(breaks = seq(0,660,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = table.builder.nsw, aes(x = st_min)) +
geom_histogram() +
scale_x_continuous(breaks = seq(0,660,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Function for univariate Exploratory Data Analysis, separated by urban/regional SA2 areas:
# scatter plots for each variable by the shortest time to an achd clinic
scatter.vars.grouped <- function(var) {
#plot the boxplot
ggplot(data = table.builder2, aes_string(x = 'st_min', y = var, color = 'GCC_NAME16')) +
geom_point(size = 0.25) +
geom_smooth(method = 'lm', size = 1.25) +
scale_x_continuous(breaks = seq(0,660,60)) +
ggtitle(var)
}
vars <- c("total_pop","male", "female","overseas","childcare","employ","oc_positive","oc_negative","sp_male", "sp_female","sp_total","student","over60")
lapply(vars, function(x) scatter.vars.grouped(x))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
#Modelling the Sydney dataset
##Fit Univariate Linear Models
#functions to fit univariate regression models
univ.lm <- function(outcome, variables) {
# list to store output tables
out <- list()
# count to add to list
count <- 1
# loop through the variables
for (i in variables) {
f <- as.formula(paste(outcome, i, sep = '~'))
# fit the glm model and add to list
out[[count]] <- lm(f, data=table.builder.syd)
#add to the count
count <- count + 1
}
return(out)
}
#fit an univarite model for each variable
uni.mods <- univ.lm("st_min", vars)
# display the coefficients and p values for each univariate model
for (i in 1:length(vars)) {
print(summary(uni.mods[[i]])$coefficients)
}
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.041926293 2.9772870656 15.464389 5.972558e-40
## total_pop -0.001128612 0.0001702919 -6.627511 1.612969e-10
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121.375688 36.3150285 3.342299 0.0009376152
## male -1.899057 0.7363815 -2.578904 0.0103942361
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68.316964 37.3913563 -1.827079 0.06869473
## female 1.894836 0.7369582 2.571158 0.01062433
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.318104 2.48680649 26.26586 2.552974e-79
## overseas -0.917315 0.05683346 -16.14040 1.778215e-42
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.851210 6.7860564 -2.041128 4.212482e-02
## childcare 1.850531 0.2974519 6.221277 1.679462e-09
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.19824 3.879058 10.36289 1.130627e-21
## employ -4.21053 1.251856 -3.36343 8.711515e-04
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.4877389 4.84618456 -2.164123 3.125448e-02
## oc_positive 0.6698549 0.08266751 8.103001 1.422453e-14
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.666989 1.7100152 22.61207 1.878443e-66
## oc_negative -2.006895 0.2431683 -8.25311 5.150880e-15
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.950695 4.014545 -1.233190 2.184835e-01
## sp_male 47.995684 5.670231 8.464503 1.209079e-15
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.18286 3.649637 3.612103 3.566871e-04
## sp_female 4.48312 1.062118 4.220924 3.240008e-05
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.04424 3.7920787 2.648742 8.512744e-03
## sp_total 4.50395 0.9172225 4.910422 1.505024e-06
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.830494 7.9389335 7.284416 2.942134e-12
## student -1.177976 0.3076969 -3.828365 1.574436e-04
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.821078 3.7569606 -2.614102 9.403914e-03
## over60 1.920408 0.1845368 10.406640 8.079510e-22
##Build a model with the variables that are significant in the univariate models
significant variables in univariate model: total_pop, male, female, overseas, childcare, employ, oc_positive, oc_negative, sp_male, sp_female, sp_total, student, over_60
sa2.lm <- lm(st_min ~ total_pop + male + female + overseas + childcare + employ + oc_positive +
oc_negative + sp_male + sp_female + sp_total+ student + over60,
data = table.builder.syd)
summary(sa2.lm)
##
## Call:
## lm(formula = st_min ~ total_pop + male + female + overseas +
## childcare + employ + oc_positive + oc_negative + sp_male +
## sp_female + sp_total + student + over60, data = table.builder.syd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.559 -8.637 -0.813 7.704 37.865
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.543e+03 2.109e+03 -0.731 0.46508
## total_pop -3.484e-04 1.165e-04 -2.991 0.00302 **
## male 1.613e+01 2.110e+01 0.765 0.44517
## female 1.528e+01 2.108e+01 0.725 0.46893
## overseas -1.200e+00 1.064e-01 -11.275 < 2e-16 ***
## childcare 8.475e-01 3.601e-01 2.353 0.01928 *
## employ 1.372e+01 1.797e+00 7.637 3.39e-13 ***
## oc_positive -2.778e-01 1.384e-01 -2.007 0.04572 *
## oc_negative -3.133e-01 4.314e-01 -0.726 0.46827
## sp_male 4.034e+00 8.171e+00 0.494 0.62194
## sp_female -2.297e+00 1.530e+00 -1.502 0.13432
## sp_total NA NA NA NA
## student -1.505e-01 2.890e-01 -0.521 0.60298
## over60 1.111e+00 1.831e-01 6.066 4.15e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.06 on 285 degrees of freedom
## Multiple R-squared: 0.677, Adjusted R-squared: 0.6634
## F-statistic: 49.78 on 12 and 285 DF, p-value: < 2.2e-16
##Build another model with the sigificant variables from sa2.lm
reduced.sa2.lm <- lm(st_min ~ total_pop + overseas + childcare + employ + oc_positive + over60, data = table.builder.syd)
summary(reduced.sa2.lm)
##
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + employ +
## oc_positive + over60, data = table.builder.syd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.329 -8.640 -0.591 7.584 39.434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.0429430 9.5181992 3.682 0.000276 ***
## total_pop -0.0004153 0.0001136 -3.656 0.000304 ***
## overseas -1.2178819 0.0797015 -15.281 < 2e-16 ***
## childcare 0.6534680 0.3356037 1.947 0.052480 .
## employ 11.1252769 1.0288700 10.813 < 2e-16 ***
## oc_positive -0.3006557 0.1073464 -2.801 0.005439 **
## over60 0.9676831 0.1515301 6.386 6.72e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.13 on 291 degrees of freedom
## Multiple R-squared: 0.6661, Adjusted R-squared: 0.6592
## F-statistic: 96.76 on 6 and 291 DF, p-value: < 2.2e-16
anova(reduced.sa2.lm, sa2.lm)
## Analysis of Variance Table
##
## Model 1: st_min ~ total_pop + overseas + childcare + employ + oc_positive +
## over60
## Model 2: st_min ~ total_pop + male + female + overseas + childcare + employ +
## oc_positive + oc_negative + sp_male + sp_female + sp_total +
## student + over60
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 291 42818
## 2 285 41423 6 1394.8 1.5995 0.1471
##Build a further reduced model
reduced.sa2.lm.2 <- lm(st_min ~ total_pop + overseas + employ + over60, data = table.builder.syd)
summary(reduced.sa2.lm.2)
##
## Call:
## lm(formula = st_min ~ total_pop + overseas + employ + over60,
## data = table.builder.syd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.213 -8.363 -1.088 7.142 40.711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.4417669 5.1305418 5.933 8.36e-09 ***
## total_pop -0.0003898 0.0001143 -3.411 0.000738 ***
## overseas -1.1370888 0.0672525 -16.908 < 2e-16 ***
## employ 10.9293422 1.0107852 10.813 < 2e-16 ***
## over60 0.9159013 0.1503025 6.094 3.47e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.25 on 293 degrees of freedom
## Multiple R-squared: 0.6571, Adjusted R-squared: 0.6524
## F-statistic: 140.4 on 4 and 293 DF, p-value: < 2.2e-16
anova(reduced.sa2.lm.2, sa2.lm)
## Analysis of Variance Table
##
## Model 1: st_min ~ total_pop + overseas + employ + over60
## Model 2: st_min ~ total_pop + male + female + overseas + childcare + employ +
## oc_positive + oc_negative + sp_male + sp_female + sp_total +
## student + over60
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 293 43972
## 2 285 41423 8 2549.2 2.1924 0.02811 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Analysis of variance suggests that reducing the model further does help to imporve the residual errors, so we will go with the second reduced model.
reduced.sa2.lm <- lm(st_min ~ total_pop + overseas + employ + oc_positive + over60, data = table.builder.syd)
##Model Diagnoistics
# residuals versus fitted values plot
ols_plot_resid_fit(reduced.sa2.lm.2)
# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(reduced.sa2.lm.2, rhs=TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -------------------------------------------
## Response : st_min
## Variables: total_pop overseas employ over60
##
## Test Summary
## -------------------------------
## DF = 4
## Chi2 = 26.86737
## Prob > Chi2 = 2.114471e-05
table.builder.syd$residuals <- residuals(reduced.sa2.lm.2)
ggplot(data = table.builder.syd, aes(x = st_min, y = residuals)) +
geom_point() +
scale_x_continuous(breaks = seq(0,660,60)) +
ggtitle("Residuals vs Outcome")
ggplot(data = table.builder.syd, aes(x = total_pop, y = residuals)) +
geom_point() +
ggtitle("Residuals vs Total Population")
ggplot(data = table.builder.syd, aes(x = overseas, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of population born overseas")
ggplot(data = table.builder.syd, aes(x = childcare, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of population engaged in childcare")
ggplot(data = table.builder.syd, aes(x = employ, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of population unemployed")
ggplot(data = table.builder.syd, aes(x = oc_positive, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of undercrowded households")
ggplot(data = table.builder.syd, aes(x = over60, y = residuals)) +
geom_point() +
ggtitle("Residuals vs % of population over 60 years")
# test model with transformation in 'overseas'
lm.test <- lm(st_min ~ total_pop + I(log(overseas)) + employ + oc_positive + over60, data = table.builder.syd)
summary(lm.test)
##
## Call:
## lm(formula = st_min ~ total_pop + I(log(overseas)) + employ +
## oc_positive + over60, data = table.builder.syd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.238 -7.639 -0.150 6.614 36.480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.505e+02 1.229e+01 12.245 < 2e-16 ***
## total_pop -3.081e-04 1.076e-04 -2.864 0.00448 **
## I(log(overseas)) -4.330e+01 2.482e+00 -17.448 < 2e-16 ***
## employ 9.136e+00 9.075e-01 10.067 < 2e-16 ***
## oc_positive -1.043e-01 7.063e-02 -1.476 0.14094
## over60 9.453e-01 1.393e-01 6.786 6.42e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.45 on 292 degrees of freedom
## Multiple R-squared: 0.7014, Adjusted R-squared: 0.6963
## F-statistic: 137.2 on 5 and 292 DF, p-value: < 2.2e-16
# residuals versus fitted values plot
ols_plot_resid_fit(lm.test)
# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(lm.test, rhs=TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------------------------------------
## Response : st_min
## Variables: total_pop I(log(overseas)) employ oc_positive over60
##
## Test Summary
## -------------------------------
## DF = 5
## Chi2 = 62.9082
## Prob > Chi2 = 3.041438e-12
plot(table.builder.syd$overseas, residuals(lm.test))
#Modelling the Rest of NSW dataset
##Fit Univariate Linear Models
#functions to fit univariate regression models
univ.lm <- function(outcome, variables) {
# list to store output tables
out <- list()
# count to add to list
count <- 1
# loop through the variables
for (i in variables) {
f <- as.formula(paste(outcome, i, sep = '~'))
# fit the glm model and add to list
out[[count]] <- lm(f, data=table.builder.nsw)
#add to the count
count <- count + 1
}
return(out)
}
#fit an univarite model for each variable
uni.mods <- univ.lm("st_min", vars)
# display the coefficients and p values for each univariate model
for (i in 1:length(vars)) {
print(summary(uni.mods[[i]])$coefficients)
}
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 198.599210086 15.142710179 13.115169 2.097946e-30
## total_pop -0.003558164 0.001300985 -2.734977 6.674453e-03
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -294.19589 249.711212 -1.178144 0.23983300
## male 9.21502 5.040391 1.828235 0.06867794
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 617.101750 255.098726 2.419070 0.01625803
## female -9.012787 5.051497 -1.784181 0.07557849
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 199.490511 29.47895 6.767218 8.900618e-11
## overseas -1.959859 1.49923 -1.307244 1.923029e-01
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 349.395280 51.755581 6.750872 9.794638e-11
## childcare -8.650584 2.368331 -3.652607 3.147944e-04
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 167.311910 28.767559 5.8159927 1.791002e-08
## employ -1.817301 9.783957 -0.1857429 8.527934e-01
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 370.926687 57.8782714 6.408738 7.007389e-10
## oc_positive -3.408254 0.9376716 -3.634806 3.361916e-04
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 163.1853599 19.548346 8.34778336 4.393029e-15
## oc_negative -0.5264033 9.159154 -0.05747292 9.542133e-01
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120.82891 34.41101 3.511345 0.0005267703
## sp_male 47.09133 38.33828 1.228311 0.2204583679
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 161.82235870 28.881005 5.60307219 5.431385e-08
## sp_female 0.09066583 7.905044 0.01146936 9.908579e-01
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 154.850076 31.925134 4.8504127 2.14137e-06
## sp_total 1.652748 7.044041 0.2346306 8.14683e-01
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 268.496841 40.943106 6.557803 2.998383e-10
## student -4.985907 1.889582 -2.638630 8.834410e-03
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176.1904125 29.267957 6.0199081 6.021070e-09
## over60 -0.5005823 1.010131 -0.4955618 6.206287e-01
##Build a model with the variables that are significant in the univariate models
significant variables in univariate model: total_pop, childcare, oc_positive, student
sa2.lm <- lm(st_min ~ total_pop + childcare + oc_positive + student, data = table.builder.nsw)
summary(sa2.lm)
##
## Call:
## lm(formula = st_min ~ total_pop + childcare + oc_positive + student,
## data = table.builder.nsw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -198.84 -85.30 -23.41 63.64 491.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 386.991930 61.913748 6.251 1.73e-09 ***
## total_pop -0.001948 0.001421 -1.371 0.171
## childcare -4.958657 3.578752 -1.386 0.167
## oc_positive -1.597960 1.277203 -1.251 0.212
## student 0.015862 2.476937 0.006 0.995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 113.8 on 253 degrees of freedom
## Multiple R-squared: 0.06686, Adjusted R-squared: 0.05211
## F-statistic: 4.532 on 4 and 253 DF, p-value: 0.001492
Perhaps this suggests that there is not much relationship between the outcome and predictors?
##Model Diagnositics
# residuals versus fitted values plot
ols_plot_resid_fit(sa2.lm)
# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(sa2.lm, rhs=TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## --------------------------------------------------
## Response : st_min
## Variables: total_pop childcare oc_positive student
##
## Test Summary
## ------------------------------
## DF = 4
## Chi2 = 32.01021
## Prob > Chi2 = 1.90393e-06